// NOTE(review): the next line is a bare property read with no effect; it looks like a
// truncated `module.exports` statement in a file that otherwise uses ES `export` — confirm
// and either complete or remove it.
module.ex

/**
 * Image proxy URL prefix. Image URLs scraped from the pages are appended to it
 * as the value of the `pic_url` query parameter.
 * @type {string}
 */
const proxyImg = "https://uu.ccfish.run/angentImg/?pic_url=";

/**
 * Regular expressions used to pull the needed data out of the HTML.
 * @type {RegExp}
 */
// First `src=` attribute; capture group 1 is the attribute value. The delimiter
// character class contains `'`, `"` and also a literal backslash.
const srcMath = /src=[\\'\\\"]?([^\\'\\\"]*)[\\'\\\"]?/;
// First `href=` attribute; same shape as `srcMath`, capture group 1 is the value.
const hrefMath = /href=[\\'\\\"]?([^\\'\\\"]*)[\\'\\\"]?/;
// The four patterns below are global and case-insensitive and match lazily from the
// opening prefix to the nearest closing tag. Only the prefix (`<p`, `<i`, `<span`, `<a`)
// is required, so any tag whose name merely starts with those letters (for example
// `<pre`, `<img`, `<article`) can also begin a match.
const pMath = /<p(([\s\S])*?)<\/p>/ig;
const iMath = /<i(([\s\S])*?)<\/i>/ig;
const spanMath = /<span(([\s\S])*?)<\/span>/ig;
const aMath = /<a(([\s\S])*?)<\/a>/ig;

/**
 * cnbeta news list page: convert the page HTML into a list of entries.
 *
 * The list markup is isolated by cutting everything from the last "pager" marker
 * onwards, then everything from the last `</ul>` onwards, then everything up to and
 * including the `info_list">` marker. Each `<li>...</li>` that remains is handed to
 * `getLiTab`, and every entry it returns is collected in document order.
 *
 * @param {string} html Full HTML of the list page.
 * @returns {Array} Entries produced by `getLiTab`; items it rejects are skipped.
 */
export const $list_htmlToJson = function (html) {

    // Collected entries, in document order.
    const dataList = [];

    // Cut out the part of the page that holds the list items.
    // Note: when "pager" or "</ul>" is absent, lastIndexOf yields -1 and the slice
    // is empty, so such a page produces no entries.
    let listHtml = html.substring(0, html.lastIndexOf("pager"));
    listHtml = listHtml.substring(0, listHtml.lastIndexOf("</ul>"));
    listHtml = listHtml.substring(listHtml.indexOf("info_list") + 11);

    // Walk the items with a cursor instead of re-slicing the string on every pass.
    let cursor = 0;
    for (;;) {
        const open = listHtml.indexOf("<li>", cursor);
        if (open === -1) break;

        // Look for the closing tag AFTER the opening one, so a stray "</li>" earlier
        // in the text cannot be paired with this "<li>".
        const close = listHtml.indexOf("</li>", open + 4);

        // Unterminated item: stop rather than passing a mis-sliced fragment on.
        if (close === -1) break;

        const entry = getLiTab(listHtml.substring(open + 4, close));
        if (entry) dataList.push(entry);

        // Continue right after the "</li>" that was just consumed.
        cursor = close + 5;
    }

    return dataList;
};

/**
 * Extract one news entry from the inner HTML of a list `<li>`.
 *
 * The item must contain a `src=` attribute, an `href=` attribute and at least three
 * `<p>` blocks; the second block is the summary and the third must hold exactly two
 * `<i>` blocks (publish time, then view count). If any of these is missing the item
 * is rejected.
 *
 * @param {string} h Inner HTML of a single `<li>`.
 * @returns {{Img: string, Href: string, Detail: string, Time: string, View: string}|undefined}
 *   The parsed entry, or `undefined` when the item does not have the expected shape.
 */
const getLiTab = function (h) {

    // Text of an <i> block, starting `skip` characters after the start of `label`
    // and ending at the closing tag.
    const textAfterLabel = (tag, label, skip) =>
        tag.substring(tag.indexOf(label) + skip, tag.indexOf("</i>"));

    // Image URL
    const imgMatch = h.match(srcMath);
    if (!imgMatch || imgMatch.length < 2) return undefined;

    // Link to the article body
    const linkMatch = h.match(hrefMath);
    if (!linkMatch || linkMatch.length < 2) return undefined;

    // Paragraph blocks: [1] is the summary, [2] carries time and view count
    const paragraphs = h.match(pMath);
    if (!paragraphs || paragraphs.length < 3) return undefined;
    const [, summaryTag, metaTag] = paragraphs;

    // Summary: inner text of the paragraph, or of its first <span> when it has one
    let summary = summaryTag.substring(summaryTag.indexOf(">") + 1, summaryTag.indexOf("</p>"));
    const innerSpans = summary.match(spanMath);
    if (innerSpans && innerSpans.length > 0) {
        const [firstSpan] = innerSpans;
        summary = firstSpan.substring(firstSpan.indexOf(">") + 1, firstSpan.lastIndexOf("<"));
    }

    // Publish time and view count live in exactly two <i> blocks
    const metaItems = metaTag.match(iMath);
    if (!metaItems || metaItems.length !== 2) return undefined;
    const [publishedTag, viewsTag] = metaItems;

    return {
        Img: proxyImg + imgMatch[1],
        Href: "https://m.cnbeta.com" + linkMatch[1],
        Detail: summary,
        Time: textAfterLabel(publishedTag, "发布于", 5),
        View: textAfterLabel(viewsTag, "阅读", 4),
    };
};

/**
 * cnbeta news article page: convert the page HTML into a plain object.
 *
 * Every value is cut out of the markup with fixed string markers and character
 * offsets. A missing marker therefore yields an empty or oddly sliced string
 * rather than an error.
 *
 * @param {string} html Full HTML of the article page.
 * @returns {Object} Page data:
 *   - `imgList`     {string[]} proxied URL of every image paragraph, in order (used for previews)
 *   - `title`       {string} text following the `article-tit` marker up to `</h1>`
 *   - `time`        {string} text inside the header's `<time>` element
 *   - `anther`      {string} first `<span>` block of the header with its first `href=`
 *                   renamed to `data-href=`, or "" when the header has no span
 *   - `commnum`     {string} text between the last `</i>` and the last `</a>` of the first
 *                   link in the header, or "" when the header has no link
 *   - `articleSumm` {string} content of the first `<p>` after the `article-summ` marker
 *   - `contents`    {Object[]} one entry per paragraph: `{t: "p", x}` for text, or
 *                   `{t: "img", x, showImg, myClass, orgImg}` for an image
 */
export const $page_htmlToJson = function (html) {

    const page = {}; // all data for the page
    page.imgList = []; // image URLs are also kept in their own list, needed for previews

    // Drop everything outside the <article> element (offsets keep one extra
    // character on each side, as the slicing has always done).
    let article = html.substring(html.indexOf("<article") - 1);
    article = article.substring(0, article.lastIndexOf("</article>") + 11);

    // Title
    const header = article.substring(article.indexOf("<header"), article.indexOf("</header>") + 10);
    page.title = header.substring(header.indexOf("article-tit") + 13, header.indexOf("</h1>"));

    // Publish time: +19 skips the opening <time ...> tag, which is assumed to be
    // 19 characters long.
    page.time = header.substring(header.indexOf("<time") + 19, header.indexOf("</time>"));

    // Author: first <span> of the header, with its link attribute neutralised
    const spanArr = header.match(spanMath);
    const authorSpan = spanArr && spanArr.length > 0 ? spanArr[0] : "";
    page.anther = authorSpan.replace("href=", "data-href=");

    // Comment count: guarded like the author lookup, so a header without any
    // link yields "" instead of throwing on a null match.
    const aArr = header.match(aMath);
    const commentLink = aArr && aArr.length > 0 ? aArr[0] : "";
    page.commnum = commentLink.substring(commentLink.lastIndexOf("</i>") + 4, commentLink.lastIndexOf("</a>"));

    // Article summary: first <p> after the "article-summ" marker
    const summaryHtml = article.substring(article.indexOf("article-summ") - 12);
    page.articleSumm = summaryHtml.substring(summaryHtml.indexOf("<p>") + 3, summaryHtml.indexOf("</p>"));

    // Article body: the markup between the "artibody" element and the "cbv" element.
    // When the "cbv" marker is absent the slice is empty and no paragraphs are produced.
    page.contents = [];
    let body = article.substring(article.indexOf("id=\"artibody\"") - 25);
    body = body.substring(0, body.indexOf("class=\"cbv\"") - 7);
    const paragraphs = body.match(pMath) || []; // match() returns null when nothing matches

    // Turn every paragraph into either a text entry or an image entry.
    for (const [index, raw] of paragraphs.entries()) {
        // Only a bare "<p>" opener is stripped; openers with attributes are left in the text.
        const text = raw.replace("<p>", "").replace("</p>", "");
        const p = {};
        if (text.indexOf("<img") === -1) {
            p.t = "p";
            p.x = text;
        } else {
            p.t = "img";
            // Start with a transparent placeholder image
            p.x = "/static/img/touming.png";
            p.showImg = false;
            p.myClass = "p_class_" + index;

            // Value of the first src="..." attribute following the <img
            let src = text.substring(text.indexOf("<img"));
            src = src.substring(src.indexOf("src=") + 5);
            src = src.substring(0, src.indexOf("\""));
            p.orgImg = proxyImg + src;

            // Images are also kept in their own list, needed for previews
            page.imgList.push(p.orgImg);
        }
        page.contents.push(p);
    }

    return page;
};


